import pandas as pd
import numpy as np
import seaborn as sns
import pylab as plt
# Matplotlib text setup: use a font that contains Chinese glyphs so Chinese
# labels render, and keep the minus sign displayable (the original note says
# this second setting does not work together with ['SimHei']).
plt.rcParams.update({
    'font.sans-serif': ['Microsoft YaHei'],
    'axes.unicode_minus': False,
})
import os

# Folder that holds the hourly data, and the CSV inside it that the rest of
# the script reads from (and later writes back to).
file_dir = r'C:\Users\11730\OneDrive - City University of Hong Kong\Desktop\datav3\datav3\爬虫\分小时数据'
new_filename = f'{file_dir}\\new_file.csv'

# The file is decoded as GB18030.
df = pd.read_csv(new_filename, encoding='gb18030')
# Bare expressions below only show output when run as notebook cells.
df.head()
len(df)
# --- Word-segmentation experiments -------------------------------------
# Try jieba on the first three entries of the '热搜词' column. The bare
# list(...) expressions only show output when run as notebook cells.
import jieba
list(jieba.cut(df['热搜词'][0], cut_all=False))
list(jieba.cut(df['热搜词'][1], cut_all=False))
list(jieba.cut(df['热搜词'][2], cut_all=False))
# NOTE(review): the next two lines are IPython/Jupyter syntax (a `!` shell
# escape and the `pip` automagic), not plain Python; this file only runs
# as-is inside IPython.
!pip install thulac
pip list
import thulac
thu1 = thulac.thulac() # default mode
# Segment one sentence with text=True (presumably returns one string - confirm).
text = thu1.cut("易烊千玺太A了", text=True) # segment a single sentence
print(text)
# Segment one sentence with text=False; cut_words below unpacks each item of
# this form of the result into two values.
text = thu1.cut("王一博水中唱跳", text=False) # segment a single sentence
print(text)
def cut_words(txt):
    """Segment ``txt`` with the module-level ``thu1`` and return only the words.

    ``thu1.cut(txt, text=False)`` is iterated as two-item entries; the first
    item of each entry is kept and the second (presumably the part-of-speech
    tag - confirm against thulac) is discarded.

    Args:
        txt: The text to segment.

    Returns:
        A list with the first item of every entry, in the original order.
    """
    words = []
    for word, _second in thu1.cut(txt, text=False):
        words.append(word)
    return words
# Progress printer that keeps rewriting the same console line.
import sys


def flushPrint(d):
    """Write a carriage return followed by ``str(d)`` to stdout and flush.

    No newline is written, so the next call starts again at the beginning
    of the same line.

    Args:
        d: Any object; it is converted with ``str``.
    """
    out = sys.stdout
    out.write('\r')  # go back to the start of the current line
    out.write(str(d))
    out.flush()  # push the text out immediately
# Quick check of the tokenizer on one sample string.
cut_words('王一博水中唱跳')

# Tokenize every entry of the '热搜词' column. The row counter is printed
# every 100 rows as a progress indicator; tokenizing happens for every row
# (wlist must end up with one entry per row to be assigned as a column).
wlist = []
for k, i in enumerate(df['热搜词']):
    if k % 100 == 0:
        flushPrint(k)
    text = cut_words(i)
    wlist.append(text)
df['wlist'] = wlist
df  # notebook display only

# This overwrites the very file that was loaded with encoding='gb18030'.
# Write it back in the same encoding: pandas would otherwise write UTF-8, and
# the next read_csv(..., encoding='gb18030') of this path would mis-decode it.
df.to_csv(new_filename, index=False, encoding='gb18030')

# Print the first 100 terms separated by spaces.
print(*df['热搜词'][:100])
df[10000:]  # notebook display only
def successive_list(alist):
    """Return every pair of adjacent items of ``alist``.

    Args:
        alist: A sliceable sequence (for example a list of words).

    Returns:
        A list of two-item lists ``[alist[i], alist[i + 1]]`` for each
        ``i`` from 0 to ``len(alist) - 2``. Sequences with fewer than two
        items give an empty list.
    """
    # zip stops at the shorter input, so pairing the sequence with itself
    # shifted by one yields exactly the adjacent pairs, using one slice
    # instead of two fresh slices per index.
    return [[left, right] for left, right in zip(alist, alist[1:])]
import networkx as nx
import itertools

# Build an undirected graph in which two words are linked when they appear
# next to each other in a tokenized entry of wlist.
G = nx.Graph()
for slist in wlist:
    # Entries with at most one token are skipped.
    if len(slist) <= 1:
        continue
    # Drop tokens of length 1 first, then pair up what remains; words on
    # either side of a dropped token therefore become neighbours.
    slist = [token for token in slist if len(token) > 1]
    edgelist = successive_list(slist)
    G.add_edges_from(edgelist)
# NOTE(review): nx.info appears to have been removed in NetworkX 3.0 - confirm
# the installed version. As a bare expression it only shows in a notebook.
nx.info(G)
# Three node-importance measures on the word graph.
pr = nx.pagerank(G, alpha=0.9)
deg = dict(nx.degree_centrality(G))
dei = dict(nx.eigenvector_centrality(G))

# One row per node, ordered by degree centrality (highest first).
# reset_index() without drop=True keeps the pre-sort row number as an
# 'index' column.
dd = (
    pd.DataFrame(
        [(node, deg[node], dei[node], pr[node]) for node in deg],
        columns=('behavior', 'Centrality', 'Eigenvector Centrality', 'PageRank'),
    )
    .sort_values(by=['Centrality'], ascending=False)
    .reset_index()
)
dd[:20]    # notebook display only
dd[20:40]  # notebook display only
# Print the 200 top-ranked node names separated by spaces.
print(*dd['behavior'][:200])

# Scatter plot of PageRank against degree centrality.
plt.figure(figsize=(6, 6))
plt.style.use('ggplot')
sns.scatterplot(x="Centrality", y="PageRank", data=dd)
plt.xlabel('$Centrality$', fontsize=20)
plt.ylabel('$PageRank$', fontsize=20)
plt.show()
from collections import defaultdict
import numpy as np
def plotDegreeDistribution(G):
    """Plot the degree distribution P(K) of ``G`` on log-log axes.

    For each distinct degree K the fraction of nodes having that degree is
    drawn as a blue dot on the current matplotlib figure, and the figure is
    then shown.

    Args:
        G: A graph object whose ``degree()`` result can be turned into a
            ``{node: degree}`` dict (as with the networkx graph built in
            this file).

    Returns:
        None. A graph without nodes produces an empty plot instead of an
        unpacking error.
    """
    degrees = np.array(list(dict(G.degree()).values()))
    # np.unique gives the distinct degrees in ascending order together with
    # the number of nodes having each one.
    x, counts = np.unique(degrees, return_counts=True)
    # Normalise the counts so that the plotted values sum to 1.
    y = counts / counts.sum()
    plt.plot(x, y, 'bo')
    plt.xscale('log')
    plt.yscale('log')
    plt.xlabel('$K$', fontsize=20)
    plt.ylabel('$P(K)$', fontsize=20)
    # Raw string: "\," is a mathtext thin space, not a Python escape sequence.
    plt.title(r'$Degree\,Distribution$', fontsize=20)
    plt.show()
# Degree distribution of the full word graph.
plt.style.use('ggplot')
plt.figure(figsize=(6, 6))
plotDegreeDistribution(G)

# Remove self-loops, then keep only the 16-core of the graph.
# NOTE(review): self-loops are presumably removed because nx.k_core does not
# accept them - confirm against the NetworkX documentation.
G.remove_edges_from(nx.selfloop_edges(G))
g = nx.k_core(G, k=16)
nx.info(g)  # notebook display only

# Set the font options again after the style change above: a font with
# Chinese glyphs for the node labels, and a displayable minus sign (the
# original note says this second setting does not work with ['SimHei']).
plt.rcParams.update({
    'font.sans-serif': ['Microsoft YaHei'],
    'axes.unicode_minus': False,
})

# Draw the core on a large, high-resolution canvas with node labels.
plt.figure(figsize=(16, 16), dpi=300)
nx.draw(
    g,
    with_labels=True,
    edge_color="grey",
    node_color='blue',
    rotate=True,
    node_size=26,
)
# Extra horizontal margin around the drawing.
plt.margins(x=0.2)